{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import *\n",
    "sc=SparkContext(appName=\"DataFrame_d2\") \n",
    "spark=SparkSession.builder.appName('d2').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "staff=[(1,'mike',30,'finance',24000),(2,'lee',34,'develop',36000),(3,'allen',36,'manager',40000),(4,'jane',None,'CFO',None)] \n",
    "staff_schema = StructType([\n",
    "        StructField(\"id\", IntegerType(), True),    \n",
    "        StructField(\"name\", StringType(), True),\n",
    "        StructField(\"age\", IntegerType(), True),\n",
    "        StructField(\"job\", StringType(), True),\n",
    "        StructField(\"salary\",LongType(),True)\n",
    "])\n",
    "staff_df=spark.createDataFrame(staff,staff_schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "user=[(1,'mike','BeiJin','朝阳'),(2,None,'ShangHai','徐汇'),(3,'allen','GuangZhou','天河'),(4,'jane','ShenZhen','福田')]\n",
    "user_schema=StructType([\n",
    "        StructField(\"id\", IntegerType(), True),    \n",
    "        StructField(\"name\", StringType(), True),\n",
    "        StructField(\"city\", StringType(), True),\n",
    "        StructField(\"region\", StringType(), True)\n",
    "])\n",
    "user_df=spark.createDataFrame(user,user_schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "cross_data=staff_df.crossJoin(user_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+----+-------+------+---+-----+---------+------+\n",
      "| id| name| age|    job|salary| id| name|     city|region|\n",
      "+---+-----+----+-------+------+---+-----+---------+------+\n",
      "|  1| mike|  30|finance| 24000|  1| mike|   BeiJin|æé³|\n",
      "|  1| mike|  30|finance| 24000|  2| null| ShangHai|å¾æ±|\n",
      "|  1| mike|  30|finance| 24000|  3|allen|GuangZhou|å¤©æ²³|\n",
      "|  1| mike|  30|finance| 24000|  4| jane| ShenZhen|ç¦ç°|\n",
      "|  2|  lee|  34|develop| 36000|  1| mike|   BeiJin|æé³|\n",
      "|  2|  lee|  34|develop| 36000|  2| null| ShangHai|å¾æ±|\n",
      "|  2|  lee|  34|develop| 36000|  3|allen|GuangZhou|å¤©æ²³|\n",
      "|  2|  lee|  34|develop| 36000|  4| jane| ShenZhen|ç¦ç°|\n",
      "|  3|allen|  36|manager| 40000|  1| mike|   BeiJin|æé³|\n",
      "|  3|allen|  36|manager| 40000|  2| null| ShangHai|å¾æ±|\n",
      "|  3|allen|  36|manager| 40000|  3|allen|GuangZhou|å¤©æ²³|\n",
      "|  3|allen|  36|manager| 40000|  4| jane| ShenZhen|ç¦ç°|\n",
      "|  4| jane|null|    CFO|  null|  1| mike|   BeiJin|æé³|\n",
      "|  4| jane|null|    CFO|  null|  2| null| ShangHai|å¾æ±|\n",
      "|  4| jane|null|    CFO|  null|  3|allen|GuangZhou|å¤©æ²³|\n",
      "|  4| jane|null|    CFO|  null|  4| jane| ShenZhen|ç¦ç°|\n",
      "+---+-----+----+-------+------+---+-----+---------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "cross_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "except_data=user_df.select('id','name').exceptAll(staff_df.select('id','name'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+----+\n",
      "| id|name|\n",
      "+---+----+\n",
      "|  2|null|\n",
      "+---+----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "except_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+\n",
      "| id| name|\n",
      "+---+-----+\n",
      "|  4| jane|\n",
      "|  3|allen|\n",
      "|  1| mike|\n",
      "+---+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "user_df.select('id','name').intersectAll(staff_df.select('id','name')).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+------+---------+\n",
      "| name|salary|     city|\n",
      "+-----+------+---------+\n",
      "| mike| 24000|   BeiJin|\n",
      "|allen| 40000|GuangZhou|\n",
      "| jane|  null| ShenZhen|\n",
      "|  lee| 36000| ShangHai|\n",
      "+-----+------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_df.join(user_df,staff_df.id==user_df.id,'left').select(staff_df.name,staff_df.salary,user_df.city).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(min(salary)=24000)]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.agg({'salary':'min'}).collect()\n",
    "from pyspark.sql import functions as F\n",
    "staff_df.agg(F.min(staff_df.salary)).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "t=staff_df.select(staff_df.age.cast(DoubleType()),staff_df.salary.cast(DoubleType()) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9714027646697837"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.corr('age','salary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9714027646697837"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.corr('age','salary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "294666.6666666667"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.cov('age','salary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "294666.6666666667"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.cov('age','salary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.group.GroupedData"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(staff_df.cube('age'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+-----+------------------+-------+------------------+\n",
      "|summary|                id| name|               age|    job|            salary|\n",
      "+-------+------------------+-----+------------------+-------+------------------+\n",
      "|  count|                 4|    4|                 3|      4|                 3|\n",
      "|   mean|               2.5| null|33.333333333333336|   null|33333.333333333336|\n",
      "| stddev|1.2909944487358056| null| 3.055050463303893|   null|  8326.66399786453|\n",
      "|    min|                 1|allen|                30|    CFO|             24000|\n",
      "|    max|                 4| mike|                36|manager|             40000|\n",
      "+-------+------------------+-----+------------------+-------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_df.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(avg(id)=2.5, avg(age)=33.333333333333336, avg(salary)=33333.333333333336)]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.groupBy().avg().collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+-----+------------------+-------+------------------+\n",
      "|summary|                id| name|               age|    job|            salary|\n",
      "+-------+------------------+-----+------------------+-------+------------------+\n",
      "|  count|                 4|    4|                 3|      4|                 3|\n",
      "|   mean|               2.5| null|33.333333333333336|   null|33333.333333333336|\n",
      "| stddev|1.2909944487358056| null| 3.055050463303893|   null|  8326.66399786453|\n",
      "|    min|                 1|allen|                30|    CFO|             24000|\n",
      "|    max|                 4| mike|                36|manager|             40000|\n",
      "+-------+------------------+-----+------------------+-------+------------------+\n",
      "\n",
      "+---+-----+----+-------+------+\n",
      "| id| name| age|    job|salary|\n",
      "+---+-----+----+-------+------+\n",
      "|  3|allen|  36|manager| 40000|\n",
      "|  4| jane|null|    CFO|  null|\n",
      "|  2|  lee|  34|develop| 36000|\n",
      "|  1| mike|  30|finance| 24000|\n",
      "+---+-----+----+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_df.describe().show()\n",
    "staff_df.sort('name','salary').show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+-----+---+-------+------+\n",
      "|summary| id| name|age|    job|salary|\n",
      "+-------+---+-----+---+-------+------+\n",
      "|  count|  4|    4|  3|      4|     3|\n",
      "|    min|  1|allen| 30|    CFO| 24000|\n",
      "|    25%|  1| null| 30|   null| 24000|\n",
      "|    75%|  3| null| 36|   null| 40000|\n",
      "|    max|  4| mike| 36|manager| 40000|\n",
      "+-------+---+-----+---+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_df.summary(\"count\", \"min\", \"25%\", \"75%\", \"max\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: integer (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- age: integer (nullable = true)\n",
      " |-- job: string (nullable = true)\n",
      " |-- salary: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_df_copy=user_df.alias('user_df_copy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', city=u'BeiJin', region=u'\\xe6\\x9c\\x9d\\xe9\\x98\\xb3'),\n",
       " Row(id=3, name=u'allen', city=u'GuangZhou', region=u'\\xe5\\xa4\\xa9\\xe6\\xb2\\xb3'),\n",
       " Row(id=4, name=u'jane', city=u'ShenZhen', region=u'\\xe7\\xa6\\x8f\\xe7\\x94\\xb0')]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_df_copy.dropna().collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', city=u'BeiJin', region=u'\\xe6\\x9c\\x9d\\xe9\\x98\\xb3'),\n",
       " Row(id=2, name=u'anonymous', city=u'ShangHai', region=u'\\xe5\\xbe\\x90\\xe6\\xb1\\x87'),\n",
       " Row(id=3, name=u'allen', city=u'GuangZhou', region=u'\\xe5\\xa4\\xa9\\xe6\\xb2\\xb3'),\n",
       " Row(id=4, name=u'jane', city=u'ShenZhen', region=u'\\xe7\\xa6\\x8f\\xe7\\x94\\xb0')]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_df_copy.fillna('anonymous').collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=2, name=None, city=u'ShangHai', region=u'\\xe5\\xbe\\x90\\xe6\\xb1\\x87')]"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_df.filter(user_df.name.isNull()).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_null=user_df.filter(user_df.name.isNull())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=None, city=u'ShangHai', region=u'\\xe5\\xbe\\x90\\xe6\\xb1\\x87')]"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_null.select('name','city','region')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_na=user_df_copy.na"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', city=u'BeiJin', region=u'\\xe6\\x9c\\x9d\\xe9\\x98\\xb3'),\n",
       " Row(id=2, name=u'anonymous', city=u'ShangHai', region=u'\\xe5\\xbe\\x90\\xe6\\xb1\\x87'),\n",
       " Row(id=3, name=u'allen', city=u'GuangZhou', region=u'\\xe5\\xa4\\xa9\\xe6\\xb2\\xb3'),\n",
       " Row(id=4, name=u'jane', city=u'ShenZhen', region=u'\\xe7\\xa6\\x8f\\xe7\\x94\\xb0')]"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_na.fill('anonymous').collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "== Physical Plan ==\n",
      "*(1) Filter isnull(name#11)\n",
      "+- Scan ExistingRDD[id#10,name#11,city#12,region#13]\n"
     ]
    }
   ],
   "source": [
    "user_null.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "staff_foreach=staff_df.filter(\"id!=4\").foreach(lambda x : x.salary * 1.5 )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'mike', (salary * 2)=48000),\n",
       " Row(name=u'lee', (salary * 2)=72000),\n",
       " Row(name=u'allen', (salary * 2)=80000),\n",
       " Row(name=u'jane', (salary * 2)=None)]"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.selectExpr('name','salary *2').collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[name: string, (city + -cn): double]"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_df.selectExpr('name',\"city + '-cn' \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', city=u'BeiJin', region=u'\\xe6\\x9c\\x9d\\xe9\\x98\\xb3'),\n",
       " Row(id=2, name=None, city=u'ShangHai', region=u'\\xe5\\xbe\\x90\\xe6\\xb1\\x87'),\n",
       " Row(id=3, name=u'allen', city=u'GuangZhou', region=u'\\xe5\\xa4\\xa9\\xe6\\xb2\\xb3'),\n",
       " Row(id=4, name=u'jane', city=u'ShenZhen', region=u'\\xe7\\xa6\\x8f\\xe7\\x94\\xb0')]"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(user_df.toLocalIterator())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<itertools.chain at 0x9bbbbe0>"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_df.toLocalIterator()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', age=30, job=u'finance', salary=24000, salary_tax=1200.0),\n",
       " Row(id=2, name=u'lee', age=34, job=u'develop', salary=36000, salary_tax=1800.0),\n",
       " Row(id=3, name=u'allen', age=36, job=u'manager', salary=40000, salary_tax=2000.0),\n",
       " Row(id=4, name=u'jane', age=None, job=u'CFO', salary=None, salary_tax=None)]"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.withColumn('salary_tax',staff_df.salary * 0.05).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', age=30, job=u'finance', salary=25200.0),\n",
       " Row(id=2, name=u'lee', age=34, job=u'develop', salary=37800.0),\n",
       " Row(id=3, name=u'allen', age=36, job=u'manager', salary=42000.0),\n",
       " Row(id=4, name=u'jane', age=None, job=u'CFO', salary=None)]"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.withColumn('salary',staff_df.salary * 1.05).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=1, name=u'mike', age=30, job=u'finance', salary_taxed=24000),\n",
       " Row(id=2, name=u'lee', age=34, job=u'develop', salary_taxed=36000),\n",
       " Row(id=3, name=u'allen', age=36, job=u'manager', salary_taxed=40000),\n",
       " Row(id=4, name=u'jane', age=None, job=u'CFO', salary_taxed=None)]"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.withColumnRenamed('salary','salary_taxed').collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Row(id=1, name=u'mike', city=u'BeiJin', region=u'\\xe6\\x9c\\x9d\\xe9\\x98\\xb3')\n",
      "Row(id=2, name=None, city=u'ShangHai', region=u'\\xe5\\xbe\\x90\\xe6\\xb1\\x87')\n",
      "Row(id=3, name=u'allen', city=u'GuangZhou', region=u'\\xe5\\xa4\\xa9\\xe6\\xb2\\xb3')\n",
      "Row(id=4, name=u'jane', city=u'ShenZhen', region=u'\\xe7\\xa6\\x8f\\xe7\\x94\\xb0')\n"
     ]
    }
   ],
   "source": [
    "for x in user_df.toLocalIterator():\n",
    "    print x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_json=user_df.toJSON()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.rdd.RDD"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(user_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "staff_pd=staff_df.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(staff_pd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[c1: int, c2: string, c3: int, c4: string, c5: bigint]"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_df.toDF('c1','c2','c3','c4','c5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "staff_df_copy=staff_df.alias('staff_copy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "staff_all=staff_df.union(staff_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+----+-------+------+\n",
      "| id| name| age|    job|salary|\n",
      "+---+-----+----+-------+------+\n",
      "|  1| mike|  30|finance| 24000|\n",
      "|  2|  lee|  34|develop| 36000|\n",
      "|  3|allen|  36|manager| 40000|\n",
      "|  4| jane|null|    CFO|  null|\n",
      "|  1| mike|  30|finance| 24000|\n",
      "|  2|  lee|  34|develop| 36000|\n",
      "|  3|allen|  36|manager| 40000|\n",
      "|  4| jane|null|    CFO|  null|\n",
      "+---+-----+----+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_all.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_name=staff_all.groupBy(staff_all.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'allen', count=2), Row(name=u'jane', count=2)]"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group_name.count().head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'allen', avg(id)=3.0, avg(age)=36.0, avg(salary)=40000.0),\n",
       " Row(name=u'jane', avg(id)=4.0, avg(age)=None, avg(salary)=None)]"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group_name.avg().head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import pandas_udf, PandasUDFType"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9714027646697837"
      ]
     },
     "execution_count": 147,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_all.stat.corr('age','salary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name_freqItems=[u'mike', u'jane', u'allen', u'lee'], job_freqItems=[u'manager', u'CFO', u'finance', u'develop'])]"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "staff_all.freqItems(['name','job']).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-----+-----+-----+----+\n",
      "|name_salary|24000|36000|40000|null|\n",
      "+-----------+-----+-----+-----+----+\n",
      "|       mike|    2|    0|    0|   0|\n",
      "|       jane|    0|    0|    0|   2|\n",
      "|      allen|    0|    0|    2|   0|\n",
      "|        lee|    0|    2|    0|   0|\n",
      "+-----------+-----+-----+-----+----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_all.stat.crosstab('name','salary').show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+---+-------+------+\n",
      "| id| name|age|    job|salary|\n",
      "+---+-----+---+-------+------+\n",
      "|  1| mike| 30|finance| 24000|\n",
      "|  2|  lee| 34|develop| 36000|\n",
      "|  3|allen| 36|manager| 40000|\n",
      "|  1| mike| 30|finance| 24000|\n",
      "|  2|  lee| 34|develop| 36000|\n",
      "|  3|allen| 36|manager| 40000|\n",
      "+---+-----+---+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_all.na.drop().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+---+-------+------+\n",
      "| id| name|age|    job|salary|\n",
      "+---+-----+---+-------+------+\n",
      "|  1| mike| 30|finance| 24000|\n",
      "|  2|  lee| 34|develop| 36000|\n",
      "|  3|allen| 36|manager| 40000|\n",
      "|  4| jane| -1|    CFO|    -1|\n",
      "|  1| mike| 30|finance| 24000|\n",
      "|  2|  lee| 34|develop| 36000|\n",
      "|  3|allen| 36|manager| 40000|\n",
      "|  4| jane| -1|    CFO|    -1|\n",
      "+---+-----+---+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "staff_all.na.fill(-1).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.createDataFrame([('2019-10-31','Thur')], ['date','week'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+----+\n",
      "|      date|week|\n",
      "+----------+----+\n",
      "|2019-10-31|Thur|\n",
      "+----------+----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
